Load raw data

if(!file.exists('./../Data/Voineagu.RData')){
  load('~/MSc/Dissertation/Data/RDatas/max_var_between/LumiBatch_Voineagu.RData')
    
  # Remove cerebellum samples
  LumiBatch_v = LumiBatch_v[,colnames(LumiBatch_v) %in%
                              pData(LumiBatch_v)$GSM[pData(LumiBatch_v)$Cortex.area!='cerebellum']]
  pData(LumiBatch_v) = pData(LumiBatch_v)[pData(LumiBatch_v)$Cortex.area != 'cerebellum',]
  datExpr = exprs(LumiBatch_v)
  datMeta = pData(LumiBatch_v)
  
  # Perform Differential Expression Analysis
  SAM_fit = SAM(datExpr, as.factor(datMeta$Disease.status), resp.type = 'Two class unpaired', 
                geneid = rownames(datExpr), random.seed = 1234, logged2 = FALSE, fdr.output = 1)
  genes_up = SAM_fit$siggenes.table$genes.up %>% data.frame
  genes_down = SAM_fit$siggenes.table$genes.lo %>% data.frame
  DE_info = genes_up %>% bind_rows(genes_down)
  
  # Load SFARI information
  SFARI_genes = read_csv('./../../Gandal/RNAseq/working_data/SFARI_genes_01-15-2019.csv')
  ilmn_gene_map = mapIds(illuminaHumanv4.db, key=rownames(datExpr), 
                         column=c('SYMBOL'), keytype='PROBEID', multiVals='filter')
  SFARI_genes = data.frame('ID'=names(ilmn_gene_map), 'gene-symbol'=unname(ilmn_gene_map)) %>%
                 right_join(SFARI_genes, by=c('gene.symbol'='gene-symbol'))
  
  # Add neuronal annotations
  GO_annotations = read.csv('./../../Gandal/RNAseq/working_data/genes_GO_annotations.csv')
  GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>% 
                distinct(ensembl_gene_id) %>%
                mutate('Neuronal' = 1)
  
  mart = useMart(biomart='ENSEMBL_MART_ENSEMBL', dataset='hsapiens_gene_ensembl',
                 host='feb2014.archive.ensembl.org')
  ensembl_gene_map = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('ensembl_gene_id'), 
                     values=GO_neuronal$ensembl_gene_id, mart=mart)
  ensembl_ilmn_gene_map = data.frame('ID' = names(ilmn_gene_map), 'gene.symbol'=unname(ilmn_gene_map)) %>% 
                          full_join(ensembl_gene_map, by=c('gene.symbol'='hgnc_symbol'))
  
  GO_neuronal = GO_neuronal %>% left_join(ensembl_ilmn_gene_map, by='ensembl_gene_id') %>%
                filter(!is.na(ID))
  
  
  save(datExpr, datMeta, GO_neuronal, SFARI_genes, DE_info, LumiBatch_v,
       file='./../Data/Voineagu.RData')
  
  rm(gpl, gse, GO_annotations, mart, ensembl_gene_map, ilmn_gene_map, gene_names,
     genes_up, genes_down, SAM_fit, ensembl_ilmn_gene_map)
  
} else load('./../Data/Voineagu.RData')

datExpr_backup = datExpr

maxFC = max(as.numeric(DE_info$Fold.Change[DE_info$Fold.Change!='Inf']))
DE_info = DE_info %>% mutate('logFC'=ifelse(Fold.Change=='Inf', log2(maxFC+1), 
                                            log2(as.numeric(Fold.Change)+1)),
                             'ID' = Gene.Name)

datMeta = datMeta %>% mutate('Diagnosis_' = Disease.status)

rm(maxFC)

Number of genes and samples:

dim(datExpr)
## [1] 15809    58


Gene count by SFARI score of remaining genes:

table(SFARI_genes$`gene-score`)
## 
##   1   2   3   4   5   6 
##  25  62 195 454 175  25


Gene count by Diagnosis and Brain lobe:

t(table(datMeta$Cortex.area, datMeta$Diagnosis_))
##          
##           frontal temporal
##   autism       16       13
##   control      16       13


Relation between SFARI scores and Neuronal functional annotation:

Neuronal_SFARI = data.frame('ID'=rownames(datExpr), 'Neuronal'=rownames(datExpr) %in% GO_neuronal$ID) %>%
                 left_join(SFARI_genes, by='ID')

tbl = table(Neuronal_SFARI$`gene-score`, Neuronal_SFARI$Neuronal, useNA='ifany') %>% t %>% as.data.frame.matrix
rownames(tbl) = c('Non-Neuronal','Neuronal')
tbl
##               1  2   3   4   5  6    NA
## Non-Neuronal 12 40 136 343 128 20 14005
## Neuronal      8 16  42  72  39  4   945
tbl = round(sweep(tbl, 2, colSums(tbl), `/`)*100, 2)
tbl
##               1     2    3     4     5     6    NA
## Non-Neuronal 60 71.43 76.4 82.65 76.65 83.33 93.68
## Neuronal     40 28.57 23.6 17.35 23.35 16.67  6.32
rm(Neuronal_SFARI, tbl)



Boxplots for raw data

Mean and Standard Deviation by score

  • The higher the SFARI score, the higher the mean and the standard deviation but the relation is not as clear as in Gandal’s dataset
make_ASD_vs_CTL_df = function(datExpr){
  datExpr_ASD = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='autism'))
  datExpr_CTL = datExpr %>% data.frame %>% dplyr::select(which(datMeta$Diagnosis_=='control'))
  
  ASD_vs_CTL = data.frame('ID'=as.character(rownames(datExpr)),
                          'mean' = rowMeans(datExpr), 'sd' = apply(datExpr, 1, sd),
                          'mean_ASD'=rowMeans(datExpr_ASD), 'mean_CTL'=rowMeans(datExpr_CTL),
                          'sd_ASD'=apply(datExpr_ASD,1,sd), 'sd_CTL'=apply(datExpr_CTL,1,sd)) %>%
               mutate('mean_diff'=mean_ASD-mean_CTL, 'sd_diff'=sd_ASD-sd_CTL) %>%
               left_join(SFARI_genes, by='ID') %>%
               dplyr::select(ID, mean, sd, mean_ASD, mean_CTL, mean_diff, sd_ASD, sd_CTL, sd_diff, `gene-score`) %>%
               mutate('Neuronal'=ifelse(ID %in% GO_neuronal$ID, 'Neuronal', 'Non-Neuronal'))
  
  ASD_vs_CTL_SFARI = ASD_vs_CTL %>% filter(!is.na(`gene-score`)) %>% 
                     mutate(Group = as.character(`gene-score`)) %>% dplyr::select(-c(Neuronal,`gene-score`))
  ASD_vs_CTL_Neuro = ASD_vs_CTL %>% mutate(Group = Neuronal) %>% dplyr::select(-c(Neuronal,`gene-score`))
  ASD_vs_CTL_all = ASD_vs_CTL %>% mutate(Group = 'All') %>% dplyr::select(-c(Neuronal,`gene-score`))
  
  ASD_vs_CTL_together = bind_rows(ASD_vs_CTL_SFARI, ASD_vs_CTL_Neuro, ASD_vs_CTL_all) %>% 
                        mutate(Group = ordered(Group, levels=c('1','2','3','4','5','6',
                                                               'Neuronal','Non-Neuronal','All'))) %>%
                        left_join(DE_info, by='ID')
  
  return(ASD_vs_CTL_together)
}

ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() + 
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

There is heterocedasticity in the data with the variance increasing at a lower rate than the mean. The relation doesn’t seem to be completely linear, but close to.

ggplot(ASD_vs_CTL, aes(mean+1, sd+1)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
         scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + 
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

  • There doesn’t seem to be a clear pattern between difference in mean and SFARI score?

  • The higher the SFARI score, the smaller the log Fold Change between Autism and Control except for scores 4 and 6

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(logFC), fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

Boxplots for log2 transformed data

# Variance stabilisation
LumiBatch_v = lumiT(LumiBatch_v, method = 'log2', ifPlot = TRUE)
## Perform log2 transformation ...
datExpr = exprs(LumiBatch_v)

# Perform Differential Expression Analysis
SAM_fit = SAM(datExpr, as.factor(datMeta$Disease.status), resp.type = 'Two class unpaired', 
              geneid = rownames(datExpr), random.seed = 1234, logged2 = TRUE, fdr.output = 1)
## perm= 1
## perm= 2
## perm= 3
## perm= 4
## perm= 5
## perm= 6
## perm= 7
## perm= 8
## perm= 9
## perm= 10
## perm= 11
## perm= 12
## perm= 13
## perm= 14
## perm= 15
## perm= 16
## perm= 17
## perm= 18
## perm= 19
## perm= 20
## perm= 21
## perm= 22
## perm= 23
## perm= 24
## perm= 25
## perm= 26
## perm= 27
## perm= 28
## perm= 29
## perm= 30
## perm= 31
## perm= 32
## perm= 33
## perm= 34
## perm= 35
## perm= 36
## perm= 37
## perm= 38
## perm= 39
## perm= 40
## perm= 41
## perm= 42
## perm= 43
## perm= 44
## perm= 45
## perm= 46
## perm= 47
## perm= 48
## perm= 49
## perm= 50
## perm= 51
## perm= 52
## perm= 53
## perm= 54
## perm= 55
## perm= 56
## perm= 57
## perm= 58
## perm= 59
## perm= 60
## perm= 61
## perm= 62
## perm= 63
## perm= 64
## perm= 65
## perm= 66
## perm= 67
## perm= 68
## perm= 69
## perm= 70
## perm= 71
## perm= 72
## perm= 73
## perm= 74
## perm= 75
## perm= 76
## perm= 77
## perm= 78
## perm= 79
## perm= 80
## perm= 81
## perm= 82
## perm= 83
## perm= 84
## perm= 85
## perm= 86
## perm= 87
## perm= 88
## perm= 89
## perm= 90
## perm= 91
## perm= 92
## perm= 93
## perm= 94
## perm= 95
## perm= 96
## perm= 97
## perm= 98
## perm= 99
## perm= 100
## 
## Computing delta table
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47
## 48
## 49
## 50
genes_up = SAM_fit$siggenes.table$genes.up %>% data.frame
genes_down = SAM_fit$siggenes.table$genes.lo %>% data.frame
DE_info = genes_up %>% bind_rows(genes_down)

maxFC = max(as.numeric(DE_info$Fold.Change[DE_info$Fold.Change!='Inf']))
DE_info = DE_info %>% mutate('logFC'=ifelse(Fold.Change=='Inf', log2(maxFC+1), 
                                            log2(as.numeric(Fold.Change)+1)),
                             'ID' = Gene.Name)

rm(SAM_fit, genes_up, genes_down, maxFC)
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() + 
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
         scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + 
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, logFC, fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

Boxplots for normalised data

# Normalisation
LumiBatch_v = lumiN(LumiBatch_v, method='rsn')
## Perform rsn normalization ...
## 2019-05-08 15:46:34 , processing array  1 
## 2019-05-08 15:46:34 , processing array  2 
## 2019-05-08 15:46:34 , processing array  3 
## 2019-05-08 15:46:34 , processing array  4 
## 2019-05-08 15:46:34 , processing array  5 
## 2019-05-08 15:46:34 , processing array  6 
## 2019-05-08 15:46:35 , processing array  7 
## 2019-05-08 15:46:35 , processing array  8 
## 2019-05-08 15:46:35 , processing array  9 
## 2019-05-08 15:46:35 , processing array  10 
## 2019-05-08 15:46:35 , processing array  11 
## 2019-05-08 15:46:35 , processing array  12 
## 2019-05-08 15:46:35 , processing array  13 
## 2019-05-08 15:46:35 , processing array  14 
## 2019-05-08 15:46:35 , processing array  15 
## 2019-05-08 15:46:35 , processing array  16 
## 2019-05-08 15:46:35 , processing array  17 
## 2019-05-08 15:46:36 , processing array  18 
## 2019-05-08 15:46:36 , processing array  19 
## 2019-05-08 15:46:36 , processing array  20 
## 2019-05-08 15:46:36 , processing array  21 
## 2019-05-08 15:46:36 , processing array  22 
## 2019-05-08 15:46:36 , processing array  23 
## 2019-05-08 15:46:36 , processing array  24 
## 2019-05-08 15:46:36 , processing array  25 
## 2019-05-08 15:46:36 , processing array  26 
## 2019-05-08 15:46:36 , processing array  27 
## 2019-05-08 15:46:36 , processing array  28 
## 2019-05-08 15:46:36 , processing array  29 
## 2019-05-08 15:46:37 , processing array  30 
## 2019-05-08 15:46:37 , processing array  31 
## 2019-05-08 15:46:37 , processing array  32 
## 2019-05-08 15:46:37 , processing array  33 
## 2019-05-08 15:46:37 , processing array  34 
## 2019-05-08 15:46:37 , processing array  35 
## 2019-05-08 15:46:37 , processing array  36 
## 2019-05-08 15:46:37 , processing array  37 
## 2019-05-08 15:46:37 , processing array  38 
## 2019-05-08 15:46:38 , processing array  39 
## 2019-05-08 15:46:38 , processing array  40 
## 2019-05-08 15:46:38 , processing array  41 
## 2019-05-08 15:46:38 , processing array  42 
## 2019-05-08 15:46:38 , processing array  43 
## 2019-05-08 15:46:38 , processing array  44 
## 2019-05-08 15:46:38 , processing array  45 
## 2019-05-08 15:46:38 , processing array  46 
## 2019-05-08 15:46:38 , processing array  47 
## 2019-05-08 15:46:38 , processing array  48 
## 2019-05-08 15:46:38 , processing array  49 
## 2019-05-08 15:46:38 , processing array  50 
## 2019-05-08 15:46:39 , processing array  51 
## 2019-05-08 15:46:39 , processing array  52 
## 2019-05-08 15:46:39 , processing array  53 
## 2019-05-08 15:46:39 , processing array  54 
## 2019-05-08 15:46:39 , processing array  55 
## 2019-05-08 15:46:39 , processing array  56 
## 2019-05-08 15:46:39 , processing array  57 
## 2019-05-08 15:46:39 , processing array  58
datExpr = exprs(LumiBatch_v)

# Perform Differential Expression Analysis
SAM_fit = SAM(datExpr, as.factor(datMeta$Disease.status), resp.type = 'Two class unpaired', 
              geneid = rownames(datExpr), random.seed = 1234, logged2 = TRUE, fdr.output = 1)
## perm= 1
## perm= 2
## perm= 3
## perm= 4
## perm= 5
## perm= 6
## perm= 7
## perm= 8
## perm= 9
## perm= 10
## perm= 11
## perm= 12
## perm= 13
## perm= 14
## perm= 15
## perm= 16
## perm= 17
## perm= 18
## perm= 19
## perm= 20
## perm= 21
## perm= 22
## perm= 23
## perm= 24
## perm= 25
## perm= 26
## perm= 27
## perm= 28
## perm= 29
## perm= 30
## perm= 31
## perm= 32
## perm= 33
## perm= 34
## perm= 35
## perm= 36
## perm= 37
## perm= 38
## perm= 39
## perm= 40
## perm= 41
## perm= 42
## perm= 43
## perm= 44
## perm= 45
## perm= 46
## perm= 47
## perm= 48
## perm= 49
## perm= 50
## perm= 51
## perm= 52
## perm= 53
## perm= 54
## perm= 55
## perm= 56
## perm= 57
## perm= 58
## perm= 59
## perm= 60
## perm= 61
## perm= 62
## perm= 63
## perm= 64
## perm= 65
## perm= 66
## perm= 67
## perm= 68
## perm= 69
## perm= 70
## perm= 71
## perm= 72
## perm= 73
## perm= 74
## perm= 75
## perm= 76
## perm= 77
## perm= 78
## perm= 79
## perm= 80
## perm= 81
## perm= 82
## perm= 83
## perm= 84
## perm= 85
## perm= 86
## perm= 87
## perm= 88
## perm= 89
## perm= 90
## perm= 91
## perm= 92
## perm= 93
## perm= 94
## perm= 95
## perm= 96
## perm= 97
## perm= 98
## perm= 99
## perm= 100
## 
## Computing delta table
## 1
## 2
## 3
## 4
## 5
## 6
## 7
## 8
## 9
## 10
## 11
## 12
## 13
## 14
## 15
## 16
## 17
## 18
## 19
## 20
## 21
## 22
## 23
## 24
## 25
## 26
## 27
## 28
## 29
## 30
## 31
## 32
## 33
## 34
## 35
## 36
## 37
## 38
## 39
## 40
## 41
## 42
## 43
## 44
## 45
## 46
## 47
## 48
## 49
## 50
genes_up = SAM_fit$siggenes.table$genes.up %>% data.frame
genes_down = SAM_fit$siggenes.table$genes.lo %>% data.frame
DE_info = genes_up %>% bind_rows(genes_down)

maxFC = max(as.numeric(DE_info$Fold.Change[DE_info$Fold.Change!='Inf']))
DE_info = DE_info %>% mutate('logFC'=ifelse(Fold.Change=='Inf', log2(maxFC+1), 
                                            log2(as.numeric(Fold.Change)+1)),
                             'ID' = Gene.Name)

rm(SAM_fit, genes_up, genes_down, maxFC)
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() + 
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
         scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + 
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, logFC, fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)

Batch correction

# Remove single batches 
datMeta$Chip = sapply(datMeta$chip_array, function(x) substr(x,1,nchar(x)-2))
to_remove = datMeta$Chip == '4936551002'
datExpr = datExpr[,!to_remove]
datMeta = datMeta[!to_remove,]

# Batch Correction
mod = model.matrix(~Diagnosis_, data=datMeta)
batch = factor(datMeta$Chip)
datExpr = ComBat(datExpr, batch=batch, mod=mod)
## Standardizing Data across genes
rm(to_remove, mod, batch)
ASD_vs_CTL = make_ASD_vs_CTL_df(datExpr)

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, mean, fill=Group)) + 
              geom_boxplot() + theme_minimal() + 
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, sd, fill=Group)) + 
              geom_boxplot() + theme_minimal() +
              ggtitle('Mean (left) and Standard Deviation (right) by score') +
              scale_fill_manual(values=gg_colour_hue(9)) +
              theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)
ggplot(ASD_vs_CTL, aes(mean, sd)) + geom_point(alpha=0.1, aes(id=ID, fill=Group, color=Group)) +
         scale_color_manual(values=gg_colour_hue(9)) + scale_fill_manual(values=gg_colour_hue(9)) +  
         scale_x_continuous(trans='log2') + scale_y_continuous(trans='log2') + 
           ggtitle(paste0('Corr=',round(cor(ASD_vs_CTL$mean, ASD_vs_CTL$sd), 2))) +
         geom_smooth(method=lm, se=FALSE, color='#808080', size=0.5) + theme_minimal()

Difference between Diagnosis by score

p1 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, abs(mean_diff), fill=Group)) + 
                geom_boxplot() + theme_minimal() + 
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

p2 = ggplotly(ggplot(ASD_vs_CTL, aes(Group, logFC, fill=Group)) + 
                geom_boxplot() + theme_minimal() +
                ggtitle('Difference in mean (left) and log Fold Change (right) by Diagnosis') +
                scale_fill_manual(values=gg_colour_hue(9)) +
                theme(legend.position='none', axis.text.x=element_text(angle=45, hjust=1)))

subplot(p1, p2, nrows=1)
rm(p1, p2)